library(tidyverse)## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.0
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(plotly)##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(aes(color=Species, shape=Species)) +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width", color = "Plant Species", shape = "Plant Species")ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(aes(color=Species, shape=Species)) +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width", color = "Plant Species", shape = "Plant Species") +
theme_classic()iris_example_plot1 <- ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(color = "red", aes(shape = Species))+
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(aes(color = Species, shape = Species)) +
scale_color_manual(values=c("blue", "purple", "red")) +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") iris_example_plot2 <-ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(aes(color = Species, shape = Species)) +
scale_color_brewer(palette="Dark2") +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") library(viridisLite)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(aes(fill = Species), color = "black", pch=21) +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") library(viridisLite)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
geom_point(aes(color = Species, shape = Species)) +
scale_colour_viridis_d() +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") You can export as a pdf, svg, tiff, png, bmp, jpeg, and eps
# Plot graph to a pdf outputfile
#pdf("images/iris_example_plot1.pdf", width=6, height=3)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point() +
labs(title = "Iris Sepal Length vs Wide", x = "Sepal Length", y = "Sepal Width") dev.off()## null device
## 1
# Plot graph to a png outputfile
ppi <- 300
#png("images/iris_example_plot2.png", width=6*ppi, height=4*ppi, res=ppi)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point()
dev.off()## null device
## 1
library(plotly)
# Version 1
ggplotly(
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point()
)# Version 2
p <- ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point()
ggplotly(p)NEON_MAGs <- read_csv("data/GOLD_NEON.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") ## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 29 rows [12, 32, 66, 79, 80,
## 88, 96, 102, 104, 240, 334, 386, 657, 790, 846, 931, 943, 983, 1041, 1095,
## ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 429 rows [6, 7, 42, 49,
## 50, 55, 60, 83, 85, 97, 100, 105, 107, 113, 114, 116, 119, 125, 129, 130, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
Remove Archaea
NEON_MAGs_bact_ind <- NEON_MAGs %>%
filter(Domain == "Bacteria") %>%
filter(`Assembly Type` == "Individual") NEON_MAGs_bact_ind %>%
ggplot(aes(x = Phylum)) +
geom_bar() +
coord_flip()forcats puts counts in descending order
NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_infreq(Phylum))) +
geom_bar() +
coord_flip()or you can pipe it
NEON_MAGs_bact_ind %>%
count(Phylum) %>%
ggplot(aes(x = Phylum, y = n)) +
geom_col(stat = "identity") +
coord_flip()## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
in descending order:
NEON_MAGs_bact_ind %>%
count(Phylum) %>%
ggplot(aes(x = reorder(Phylum, n), y = n)) +
geom_col(stat = "identity") +
coord_flip()## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Site)) +
geom_bar() +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 4)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_blank()) +
scale_y_continuous(n.breaks = 10) +
xlab("New Species") +
ylab("Count") +
labs(title = "Quality of New Species Data") +
coord_flip()NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Site)) +
geom_bar(position = "dodge") +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 4)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_blank()) +
scale_y_continuous(n.breaks = 10) +
xlab("New Species") +
ylab("Count") +
labs(title = "Quality of New Species Data") +
coord_flip()
Bar width can be adjusted
NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Site)) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
coord_flip()NEON_MAGs_bact_ind %>%
ggplot(aes(x = Phylum)) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
coord_flip() +
facet_wrap(vars(Site), scales = "free", ncol = 2)NEON_MAGs_bact_ind %>%
ggplot(aes(x = `Total Number of Bases`)) +
geom_histogram(bins = 50) NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_infreq(Phylum), y = `Total Number of Bases`)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_infreq(Phylum), y = `Total Number of Bases`)) +
geom_point() +
coord_flip()For all exercises make complete graphs that are report ready. Relabel the x-axis, y-axis and legend for clarity, add a title, add color and size appropriately. The NAs in the taxonomy indicate a novel species starting with the highest level. For example a NA in a class that has an assigned phylum Proteobacteria would be a novel class in the phylum Proteobacteria.
What are the overall class MAG counts?
The counts are displayed in the following graph:
NEON_MAGs_bact_ind %>%
count(Class) %>%
ggplot(aes(x = reorder(Class, n), y = n)) +
geom_bar(stat = "Identity", fill = "red") +
xlab("Class") +
ylab("Count") +
labs(title = "MAG Counts by Class") +
theme( axis.text.y = element_text(size = 4)) +
coord_flip()What are the MAG counts for each subplot? Color by site ID.
NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Class)), fill = Site)) +
geom_bar(position = position_dodge2(width = 0.5, preserve = "single")) +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 2, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 2)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.x = element_text(size = 4, angle = 90)) +
xlab("Class") +
ylab("Count") +
labs(title = "MAG Counts by Class and Site")How many novel bacteria were discovered? Show the number of NAs for each site
NEON_MAGs_bact_ind %>%
filter(is.na(Genus)) %>%
ggplot(aes(x = fct_rev(fct_infreq(Genus)), fill = Site)) +
geom_bar(position = position_dodge2(width = 0.5, preserve = "single")) +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 2.2)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_blank()) +
xlab("New Species") +
ylab("Count") +
labs(title = "New Species at a Given Site") +
coord_flip()How many novel bacterial MAGs are high vs medium quality?
NEON_MAGs_bact_ind %>%
filter(is.na(Genus)) %>%
ggplot(aes(x = fct_rev(fct_infreq(Genus)), fill = `Bin Quality`)) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 4)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_blank()) +
scale_y_continuous(n.breaks = 10) +
xlab("New Species") +
ylab("Count") +
labs(title = "Quality of New Species Data") +
coord_flip()What phyla have novel bacterial genera?
NEON_MAGs_bact_ind %>%
filter(is.na(Genus)) %>%
ggplot(aes(x = fct_rev(fct_infreq(Genus)), fill = `Phylum`)) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 4)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_blank()) +
scale_y_continuous(n.breaks = 20) +
xlab("New Genera") +
ylab("Count") +
labs(title = "Number of Novel Genera per Phylum") +
coord_flip()Make a stacked bar plot of the total number of MAGs at each site using phylum as the fill.
NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Site)), fill = Phylum)) +
geom_bar() +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 4)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_text(size = 4)) +
scale_y_continuous(n.breaks = 12) +
xlab("Site") +
ylab("Count") +
labs(title = "Number of MAGs per Site by Plyla") +
coord_flip()Using facet_wrap make plots of the total number of MAGS at each site for each phylum.
NEON_MAGs_bact_ind %>%
ggplot(aes(x = Site)) +
geom_bar(position = position_dodge2(width = 3, preserve = "single")) +
theme(axis.text.y = element_text(size = 2)) +
theme(axis.text.x = element_text(size = 4)) +
theme(axis.title.x = element_text(size = 6)) +
theme(axis.title.y = element_text(size = 6)) +
theme(axis.title.y.left = element_text(size = 2)) +
theme(strip.background = element_blank(),
strip.text = element_text(size = rel(0.3), margin = margin()),
panel.spacing = unit(3, "pt")) +
coord_flip() +
scale_y_continuous(n.breaks = 12) +
facet_wrap(vars(Phylum), scales = "fixed") +
xlab("Site") +
ylab("Total MAGs") +
labs(title = "Total number of MAGs per Site") What is the relationship between MAGs genome size and the number of genes? Color by phylum.
ggplot(data = NEON_MAGs_bact_ind, aes(x = `Total Number of Bases`, y = `Gene Count`, color = Phylum)) +
geom_point() +
theme(legend.position = "bottom") +
theme(legend.justification = "left") +
theme(legend.key.size = unit( 0.1, 'cm')) +
theme(legend.key.height = unit(0.1, 'cm')) +
theme(legend.key.width = unit(0.1, 'cm')) +
theme(legend.title = element_text(colour = "black", size = 4, face = "bold")) +
theme(legend.text = element_text(colour = "black", size = 4)) +
theme(legend.box.background = element_rect()) +
theme(legend.box.margin = margin(4, 4, 4, 4)) +
theme(legend.box.just = "left") +
theme( axis.text.y = element_text(size = 4)) +
scale_y_continuous(n.breaks = 12) +
xlab("Total Bases") +
ylab("Gene Count") +
labs(title = "Genome Size and Gene Number by Phylum")
There is a strong positive correlation.
What is the relationship between scaffold count and MAG completeness?
ggplot(data = NEON_MAGs_bact_ind, aes(x = `Scaffold Count`, y = `Bin Completeness`,)) +
geom_point(color = "blue") +
theme( axis.text.y = element_text(size = 10)) +
scale_y_continuous(n.breaks = 12) +
xlab("Scaffold Count") +
ylab("Percent Complete") +
labs(title = "Scaffold Count and Bin Completeness") There seems to be a slight negative correlation.